/* File: make_formulary_data.do
 * Author: Yunjuan Liu, Luca Maini
 * Purpose: Creates the dataset with the formulary data
 *
 * Date Created: 03/19/2021
 *
 */

 /*

////////////////////////////////////////
////								////
////	STEP 1. Import raw files	////
////								////
////////////////////////////////////////

* There are several files we loop through
local filelist "DRUGS Medispan_Bridge NDC_Bridge PLANSCONTROLLER PLANSCONTROLLERMCO RESTRICTIONS STATES STATUSES"
local sufflist "allyears additional 2011_jan23 2012_jan23 2013_jan23 2017_jan23 jan23_addon1 jan23_addon2"

foreach file of local filelist {
	forvalues year = 2011/2021 {
		import delimited using "${inputdir}\MMITDataFeed2.0_`year'\\`file'.txt", ///
			delimiter("|") varnames(1) stringcols(_all) clear
		
		tempfile temp`year'
		save "`temp`year''", replace
		
		}

	clear
	forvalues year = 2011/2021 {
		append using "`temp`year''"
		}
	
	save "${outdir}\\`file'_allyears.dta", replace
	}

* now do the "additional drug" pull
foreach file of local filelist {

	import delimited using "${inputdir}\Additional Drugs\\`file'.txt", ///
		delimiter("|") varnames(1) stringcols(_all) clear

	save "${outdir}\\`file'_additional.dta", replace
	}


* now do the third and fourth pull from December 2022 and February 2023
foreach file of local filelist {
	import delimited using ///
		"${inputdir}\January 2023 data pull\MMITDataFeed2.0_12_19_2022_012011-122011\\`file'.txt", ///
			delimiter("|") varnames(1) stringcols(_all) clear

	save "${outdir}\\`file'_2011_jan23.dta", replace

	import delimited using ///
		"${inputdir}\January 2023 data pull\MMITDataFeed2.0_12_19_2022_012012-122012\\`file'.txt", ///
			delimiter("|") varnames(1) stringcols(_all) clear

	save "${outdir}\\`file'_2012_jan23.dta", replace

	import delimited using ///
		"${inputdir}\January 2023 data pull\MMITDataFeed2.0_12_20_2022_012013-122016\\`file'.txt", ///
			delimiter("|") varnames(1) stringcols(_all) clear

	save "${outdir}\\`file'_2013_jan23.dta", replace

	import delimited using ///
		"${inputdir}\January 2023 data pull\MMITDataFeed2.0_12_20_2022_012017-122019\\`file'.txt", ///
			delimiter("|") varnames(1) stringcols(_all) clear

	save "${outdir}\\`file'_2017_jan23.dta", replace
	
	import delimited using ///
		"${inputdir}\January 2023 data pull\MMITDataFeed2.0_02_02_2023_012011-122019\\`file'.txt", ///
			delimiter("|") varnames(1) stringcols(_all) clear

	save "${outdir}\\`file'_jan23_addon1.dta", replace

	import delimited using ///
		"${inputdir}\January 2023 data pull\MMITDataFeed2.0_02_14_2023_012020-122022\\`file'.txt", ///
			delimiter("|") varnames(1) stringcols(_all) clear

	save "${outdir}\\`file'_jan23_addon2.dta", replace
	
	}

	
* Append before cleaning files since some variables are encoded (this ensures 
* consistent encoding across files)
local filelist "DRUGS Medispan_Bridge NDC_Bridge PLANSCONTROLLER PLANSCONTROLLERMCO RESTRICTIONS STATES STATUSES"
// local filelist "STATUSES"
local sufflist "allyears additional 2011_jan23 2012_jan23 2013_jan23 2017_jan23 jan23_addon1 jan23_addon2"

foreach file of local filelist {
	clear
	
	foreach suff of local sufflist {
		
		append using "${outdir}\\`file'_`suff'.dta"
		
		* create pull variable
		if "`suff'" == "allyears" gen pull = 1
		else if "`suff'" == "additional" replace pull = 2 if pull == .
		else if regexm("`suff'", "jan23_addon") replace pull = 4 if pull == .
		else replace pull = 3 if pull == .
		}
	
	* clean the file
	do "${dodir}\support scripts\cln_`file'.do"

	* save
	save "${outdir}/cln_`file'_allyears.dta", replace
	}

*** Add lives file with total US lives
use "${outdir}\cln_STATES_allyears.dta" 
collapse (sum) lives, by(planid m)
save "${outdir}\cln_LIVES_allyears.dta"


////////////////////////////////////////////////////////////////
////														////
////	STEP 2. Build yearly datasets for individual drugs	////
////														////
////////////////////////////////////////////////////////////////

/*	Each final dataset is identified by:
	
		- Organization ID
		- Formulary ID
		- Medication ID (drug)
		- Period (month)

	The main issue is in merging the Plan-formulary-level info (which has data 
	on channel, lives, PBM relationships, etc.) to the data at the 
	formulary-drug level (which has info on coverage, etc.). This is an m:m 
	match because the same formulary is used by multiple organizations, and also
	the same formulary has multiple drugs. So we create separate datasets for
	each drug and then append them. This takes more time, but ensures that we 
	don't have to do an m:m merge.

*/

*** STEP 1. Build the data on the organization-plan side
use "${outdir}/cln_STATES_allyears.dta", clear

* for now, collapse state-level variation
collapse (sum) lives, by(m planid)

* open and append plan-level info (use MCO data)
merge 1:1 m planid using "${outdir}/cln_PLANSCONTROLLERMCO_allyears.dta", ///
	gen(lives_merge)

// NOTES: lives info is missing for ~6.5% of observations in plans controller

* save as a temporary file
tempfile planData
save "`planData'", replace

*** STEP 2. Build list of drugs
use "${outdir}/cln_DRUGS_allyears.dta", clear
keep medid drugname

bysort medid (drugname) : keep if _n == 1

tempfile drugnames
save `drugnames', replace

// drop if medid == 596429	// RETACRIT is available in both pull 1 and 2, with two different medid values. We keep the latest pull
// drop if medid == 154994	// GLUCAGON is available in both pull 2 and 3, with two different medid values, but the 154994 medid has no data.

levelsof medid, local(idList)
clear

*** STEP 3. Create temporary datasets with formulary info for all drugs
foreach id of local idList {
	
	* open and save drug-specific datasets
	use "${outdir}/cln_STATUSES_allyears.dta", clear
	keep if medid == `id'
	tempfile statuses
	save `statuses', replace

	use "${outdir}/cln_RESTRICTIONS_allyears.dta", clear
	keep if medid == `id'
	tempfile restrictions
	save `restrictions', replace
	
	* merge all together
	use `drugnames', clear
	keep if medid == `id'
	
	merge 1:m medid using `statuses', keep(match) nogen
	
	merge 1:1 formularyid medid m using `restrictions', gen(restrictions_merge)
	
	* merge with the plans data
	merge 1:m formularyid m using `planData', keep(master match) gen(plan_merge)
	
	* Drop instances where the drugname is missing (these are observations missing plan data)
	drop if drugname == ""
	
	* save as a single-id dataset
	save "${outdir}\single_drugs\\drug_`id'.dta", replace
	}

	
////////////////////////////////////////////////////////////
////													////
////	STEP 3. Build the yearly dataset of all drugs	////
////													////
////////////////////////////////////////////////////////////

* STEP 1: build list of drugs
use "${outdir}/cln_DRUGS_allyears.dta", clear
levelsof medid, local(drugList)
clear


*** STEP 2: loop over each drug, collapse at the quarter level, then append all 
*			files

foreach id of local drugList {

	use "${outdir}\single_drugs\drug_`id'.dta", clear
	
	* drop variables we are not going to use
	drop universalstatusrollup rawstatus ///
		controllerid mcoid parentid pbmid benefitdesign ///
		notlistedpolicy nonformularypolicy ///
		lives_merge plan_merge restrictions_merge
	
	* keep data for the first month in each given quarter. The command month 
	* returns a number between 1 and 12, and must take a date format as input, 
	* so we add dofm() to transform m from monthly to daily.
	
	* EDIT 09/05/2025: we switched from using the first day of the month to 
	*				   using the mid-quarter month. This makes the quarterly 
	*				   file more consistent with the yearly file (which uses the
	*				   February formulary).

	keep if month(dofm(m)) == 2 | ///
			month(dofm(m)) == 5 | ///
			month(dofm(m)) == 8 | ///
			month(dofm(m)) == 11
	
	gen quarter = qofd(dofm(m))
	format quarter %tq
	drop m
	
	* Drop PBM Offerin plans in the first 6 months of 2014 (as instructed by 
	* MMIT, these plans lead to double-counted lives)
	drop if plantype == "PBM OFFERING":plantype & ///
		quarter >= tq(2014q1) & quarter <= tq(2014q2)
	
	* generate indicator for PA/ST restriction
	gen PA_ST = regexm(restrictioncodes, "PA") == 1 | ///
				regexm(restrictioncodes, "ST") == 1 | ///
				universalstatus == "PRIOR AUTHORIZATION REQUIRED":universalstatuslbl	// these are occasionally contradicting variables

	drop restrictioncodes
	
	* run GLP algorithm
	drop if lives == 0	// these formularies don't matter because they don't cover anyone
	run "${dodir}\support scripts\assign_geruso.do"
	keep drugname medid planid formularyid lives channel plantype ///
		pbmrelationship quarter geruso PA_ST nonpreferred
	
	* save in preparation for appending
	compress
	save "${outdir}\single_drugs\drug_`id'_quarterly.dta", replace
	
	
	*** Now keep the first quarter to make it a yearly file
	gen year = yofd(dofq(quarter))
	gen keep = quarter(dofq(quarter)) == 1
	
	* Exception: keep also first observation of the first year, if it is after 
	* February (to keep drugs that launch during the year, but after February)
	qui sum quarter
	local firstQ = r(min)
	replace keep = 1 if quarter == `firstQ' & quarter(dofq(`firstQ')) > 1
	
	drop if keep == 0
	drop quarter keep
	
	* save in preparation for appending
	compress
	save "${outdir}\single_drugs\drug_`id'_yearly.dta", replace
	}

* Append
clear
foreach id of local drugList {
	qui append using "${outdir}\single_drugs\drug_`id'_quarterly.dta"
	}

compress
save "${outdir}\allDrugs_quarterly.dta", replace

clear
foreach id of local drugList {
	qui append using "${datadir}\single_drugs\drug_`id'_yearly.dta"
	}

compress

* we drop the lives variable and re-merge it in because drugs launched after 
* February may have a different "lives" variable in that year.
drop lives

tempfile yearly
save `yearly', replace

use "${outdir}\cln_STATES_allyears.dta"
collapse (sum) lives, by(planid m)
keep if month(dofm(m)) == 2
gen year = yofd(dofm(m))
drop m

merge 1:m planid year using `yearly', keep(match) nogen	// we drop plans for which we don't have lives information

save "${outdir}\allDrugs_yearly.dta", replace


*/

////////////////////////////////////////
////								////
////	STEP 4. Merge to SSR data	////
////								////
////////////////////////////////////////

* Open data, and merge to SSR crosswalk
use "${outdir}\allDrugs_yearly.dta", clear

* merge with the SSR-to-MMIT name walkthrough
rename drugname MMIT_product
merge m:1 MMIT_product using "${inputdir}/MMIT_SSR_crosswalk.dta", keep(master match) nogen
drop MMIT_product
rename SSR_product Product

drop if Product == ""	// these are drugs for which we don't have sales 
						// data and should all be biosimilars

* collapse to Product-quarter level (take max of geruso across all MMIT products 
* matched to SSR Health)
collapse (max) geruso, by(formularyid planid lives channel pbmrelationship year Product)

* rename geruso to shorten it
rename geruso glp

* Create coverage indicators
gen frcov = glp >= 2
gen frunr = glp >= 3
gen frpre = glp >= 7

foreach var in frcov frunr frpre glp {

	* National formularies only
	gen `var'_nat = `var' if pbmrelationship == "NATIONAL":pbmrelationship

	* Commercial formularies only
	gen `var'_com = `var' if channel ==  "COMMERCIAL":channel

	}

* Keep separate variables by channel
collapse frcov* frunr* frpre* glp* [fweight=lives], by(Product year)

* save
keep Product year frcov* frunr* frpre* glp*
compress
save "${outdir}\formulary_data_cln_year-drug.dta", replace
